Filter criteria:
Filter differentially expressed genes between autism and control (adj. p-value < 0.05 and lfc > log2(1.2))
No samples are removed based on network connectivity z-scores
Preprocessing:
VST Normalisation
DESeq2 DEA
# # Load csvs
# datExpr = read.csv('./../raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
# datMeta = read.csv('./../raw_data/RNAseq_ASD_datMeta.csv')
#
# # Make sure datExpr and datMeta columns/rows match
# rownames(datMeta) = paste0('X', datMeta$Dissected_Sample_ID)
# if(!all(colnames(datExpr) == rownames(datMeta))){
# print('Columns in datExpr don\'t match the rows in datMeta!')
# }
#
# # Annotate probes
# getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
# 'end_position','strand','band','gene_biotype','percentage_gc_content')
# mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
# dataset='hsapiens_gene_ensembl',
# host='feb2014.archive.ensembl.org') ## Gencode v19
# datProbes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
# datProbes = datProbes[match(rownames(datExpr), datProbes$ensembl_gene_id),]
# datProbes$length = datProbes$end_position-datProbes$start_position
#
# # Group brain regions by lobes
# datMeta$Brain_Region = as.factor(datMeta$Region)
# datMeta$Brain_lobe = 'Occipital'
# datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45')] = 'Frontal'
# datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA3_1_2_5', 'BA7')] = 'Parietal'
# datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22')] = 'Temporal'
# datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital'))
#
# #################################################################################
# # FILTERS:
#
# # 1 Filter probes with start or end position missing (filter 5)
# # These can be filtered without probe info, they have weird IDS that don't start with ENS
# to_keep = !is.na(datProbes$length)
# datProbes = datProbes[to_keep,]
# datExpr = datExpr[to_keep,]
# rownames(datProbes) = datProbes$ensembl_gene_id
#
# # 2. Filter samples from ID AN03345 (filter 2)
# to_keep = (datMeta$Subject_ID != 'AN03345')
# datMeta = datMeta[to_keep,]
# datExpr = datExpr[,to_keep]
#
# # 3. Filter samples with rowSums <= 40
# to_keep = rowSums(datExpr)>40
# datExpr = datExpr[to_keep,]
# datProbes = datProbes[to_keep,]
#
# if(!file.exists('./../working_data/genes_ASD_DE_info_DESeq2.csv')){
# counts = as.matrix(datExpr)
# rowRanges = GRanges(datProbes$chromosome_name,
# IRanges(datProbes$start_position, width=datProbes$length),
# strand=datProbes$strand,
# feature_id=datProbes$ensembl_gene_id)
#
# se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
# ddsSE = DESeqDataSet(se, design =~Diagnosis_)
#
# dds = DESeq(ddsSE)
# DE_info = results(dds) %>% data.frame %>% rownames_to_column(var = 'ID') %>%
# mutate('logFC_DESeq2'=log2FoldChange, 'adj.P.Val_DESeq2'=padj) %>%
# dplyr::select(ID, logFC_DESeq2, adj.P.Val_DESeq2)
#
# write.csv(DE_info, './../working_data/genes_ASD_DE_info_DESeq2.csv', row.names = FALSE)
#
# rm(counts, rowRanges, se, ddsSE, dds, mart)
#
# } else DE_info = read.csv('./../working_data/genes_ASD_DE_info_DESeq2.csv')
#
# save(file='./../working_data/RNAseq_ASD_4region_DEgenes_vst_DESeq2.Rdata', datExpr, datMeta, datProbes, DE_info)
# Load preprocessed objects (datExpr, datMeta, datProbes, DE_info) produced by
# the commented-out DESeq2 preprocessing above
load('./../working_data/RNAseq_ASD_4region_DEgenes_vst_DESeq2.Rdata')

# Keep an untouched copy of the full expression matrix before filtering
datExpr_backup = datExpr

# Filter DE genes: adj. p-value < 0.05 and logFC above log2(1.2).
# which() is needed because DESeq2 can report NA padj (independent filtering);
# a bare logical index would inject all-NA rows for those genes.
datExpr = datExpr[which(DE_info$adj.P.Val_DESeq2<0.05 & DE_info$logFC_DESeq2>log2(1.2)),]

# Drop leftover objects only if they exist (the plain rm() warned when they
# were absent, e.g. when the cached CSV branch was taken above)
rm(list = intersect(c('mart', 'gene_names', 'GO_annotations'), ls()))

glue('Number of genes: ', nrow(datExpr), '\n',
     'Number of samples: ', ncol(datExpr), ' (', sum(datMeta$Diagnosis_=='ASD'), ' ASD, ',
     sum(datMeta$Diagnosis_!='ASD'), ' controls)')
## Number of genes: 1726
## Number of samples: 86 (51 ASD, 35 controls)
# Project samples onto principal components, keeping the smallest number of
# leading PCs whose cumulative explained variance reaches `var_explained`.
#
# Args:
#   datExpr:         expression matrix (genes x samples); log2(x+1) is applied here
#   datMeta:         sample metadata, rows aligned with datExpr columns
#   var_explained:   cumulative proportion of variance to retain (default 0.8)
#   filter_controls: if TRUE, keep only ASD samples before running the PCA
#
# Returns: list(datExpr = samples x kept PCs, datMeta = (possibly filtered)
#          metadata, pca_output = full prcomp object)
reduce_dim_datExpr = function(datExpr, datMeta, var_explained=0.8, filter_controls=FALSE){
  datExpr = data.frame(log2(datExpr+1))
  if(filter_controls){
    datMeta = datMeta %>% filter(Diagnosis_=='ASD')
    # BUG FIX: original referenced undefined `datMeta_ASD`; the filtered
    # `datMeta` is the object holding the ASD sample IDs
    datExpr = datExpr %>% select(paste0('X', datMeta$Dissected_Sample_ID))
  }
  datExpr_pca = prcomp(t(datExpr), scale=TRUE)
  # First PC whose cumulative variance (importance row 3) reaches the target.
  # BUG FIX: top_n(-1, ID) compared PC names lexicographically ('PC10' < 'PC9')
  # and could pick the wrong component; slice(1) keeps the first crossing in
  # the PCs' natural order
  last_pc = data.frame(summary(datExpr_pca)$importance[3,]) %>% rownames_to_column(var='ID') %>%
            filter(.[[2]] >= var_explained) %>% slice(1)
  # Scree plot (left) and cumulative variance (right), cutoff marked in blue
  par(mfrow=c(1,2))
  plot(summary(datExpr_pca)$importance[2,], type='b')
  abline(v=substr(last_pc$ID, 3, nchar(last_pc$ID)), col='blue')
  plot(summary(datExpr_pca)$importance[3,], type='b')
  abline(h=var_explained, col='blue')
  print(glue('Keeping top ', substr(last_pc$ID, 3, nchar(last_pc$ID)), ' components that explain ',
             var_explained*100, '% of the variance'))
  datExpr_top_pc = datExpr_pca$x %>% data.frame %>% dplyr::select(PC1:last_pc$ID)
  return(list('datExpr'=datExpr_top_pc, 'datMeta'=datMeta, 'pca_output'=datExpr_pca))
}
reduce_dim_output = reduce_dim_datExpr(datExpr, datMeta)
## Keeping top 10 components that explain 80% of the variance
# Unpack the dimensionality-reduction results into the objects used downstream
datExpr_redDim = reduce_dim_output$datExpr
datMeta_redDim = reduce_dim_output$datMeta
pca_output = reduce_dim_output$pca_output
# The full expression matrix and helper are no longer needed from here on
rm(datProbes, reduce_dim_datExpr, reduce_dim_output, datExpr, datMeta)
# Container collecting the sample-cluster assignment of every method below
clusterings = list()
Chose k=3
# K-means on the reduced data: elbow plot over k = 1..15, then fit with the
# chosen k. Seed fixed so the random restarts (nstart=25) are reproducible.
set.seed(123)
# vapply instead of sapply: type-stable, same iteration order so the RNG
# stream (and hence the results) are unchanged
wss = vapply(seq_len(15), function(k) kmeans(datExpr_redDim, k, nstart=25)$tot.withinss,
             numeric(1))
plot(wss, type='b', main='K-Means Clustering')
best_k = 3
abline(v = best_k, col='blue')
datExpr_k_means = kmeans(datExpr_redDim, best_k, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster
Chose k=6 as best number of clusters.
Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.
Colors:
Diagnosis: Blue=control, Green=ASD
Sex: Pink=Female, Blue=Male
Brain region: Pink=Frontal, Green=Temporal, Blue=Parietal, Purple=Occipital
Age: Purple=youngest, Yellow=oldest
# Hierarchical clustering of samples on euclidean distances (default complete
# linkage), kept as a dendrogram for the annotated plot below
h_clusts = as.dendrogram(hclust(dist(datExpr_redDim)))
# plot(h_clusts)
best_k = 6
clusterings[['hc']] = cutree(h_clusts, best_k)
# Build a named colour vector mapping every integer age in the metadata to a
# viridis colour, so the dendrogram bars can look colours up by age.
# Reads `datMeta_redDim` from the calling environment.
create_viridis_dict = function(){
  age_bounds = range(datMeta_redDim$Age)
  age_cols = viridis(age_bounds[2] - age_bounds[1] + 1)
  names(age_cols) = seq(age_bounds[1], age_bounds[2])
  return(age_cols)
}
# Named lookup: one viridis colour per integer age
viridis_age_cols = create_viridis_dict()
# Per-sample annotation colours, reordered to match the dendrogram leaf order
dend_meta = datMeta_redDim[match(labels(h_clusts), rownames(datMeta_redDim)),] %>%
mutate('Diagnosis' = ifelse(Diagnosis_=='CTL','#008080','#86b300'), # Blue control, Green ASD
'Sex' = ifelse(Sex=='F','#ff6666','#008ae6'), # Pink Female, Blue Male
'Region' = case_when(Brain_lobe=='Frontal'~'#F8766D', # ggplot defaults for 4 colours
Brain_lobe=='Temporal'~'#7CAE00',
Brain_lobe=='Parietal'~'#00BFC4',
Brain_lobe=='Occipital'~'#C77CFF'),
'Age' = viridis_age_cols[as.character(Age)]) %>% # Purple: young, Yellow: old
dplyr::select(Age, Region, Sex, Diagnosis)
# Blank the leaf labels, colour branches by the k=6 cut, and draw the
# age/region/sex/diagnosis bars under the dendrogram (dendextend)
h_clusts %>% set('labels', rep('', nrow(datMeta_redDim))) %>% set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_meta)
Samples are grouped into two big clusters. It wasn’t clear which was the best number of subclusters for the first one, but the second one was clearer. Chose 6 and 8, respectively.
*Output plots in clustering_samples_03_20 folder
Following this paper’s guidelines:
Run PCA and keep enough components to explain 60% of the variance
Run ICA with that same number of nbComp as principal components kept to then filter them
Select components with kurtosis > 3
Assign obs to genes with FDR<0.01 using the fdrtool package
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
##
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
##
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
##
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
##
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
##
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
##
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
##
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
##
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
##
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
Not a good method for clustering samples because:
ICA does not perform well with small samples (see Figure 4 of this paper)
Warnings: (Warning in fdrtool(x, plot = F): There may be too few input test statistics for reliable FDR calculations!)
Leaves some of the observations (10) without a cluster, which is better than with the past datasets, but it’s still a lot:
# How many ICA components each sample was assigned to (0 = unassigned).
# NOTE(review): ICA_clusters is built in an earlier chunk not shown here
ICA_clusters %>% rowSums %>% table
# Membership heatmap: ICA components (rows, after coord_flip) vs samples
ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2,Var1)) +
geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') +
theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()
SFT.R.sq starts in 0.58 with power=1 but at 2 decreases to 0.28 and starts slowly growing. Best power=10.
best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = seq(1, 26, by=1))
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.58400 1.3700 0.78700 42.500 46.100 56.50
## 2 2 0.24500 0.5540 0.26800 27.400 29.500 43.50
## 3 3 0.00435 0.0388 0.26700 19.500 20.300 35.30
## 4 4 0.16500 -0.1500 0.43100 14.600 14.600 29.50
## 5 5 0.51200 -0.2690 0.53200 11.400 10.900 25.20
## 6 6 0.57700 -0.4020 0.46800 9.140 8.270 21.80
## 7 7 0.54700 -0.5050 0.44400 7.490 6.430 19.20
## 8 8 0.68600 -0.5330 0.59700 6.230 5.060 17.00
## 9 9 0.79600 -0.5700 0.73900 5.260 4.000 15.30
## 10 10 0.87700 -0.5660 0.84300 4.500 3.230 13.80
## 11 11 0.65200 -0.6870 0.55300 3.880 2.650 12.50
## 12 12 0.14700 -2.1200 0.00313 3.380 2.190 11.40
## 13 13 0.15100 -2.2100 0.00511 2.970 1.820 10.50
## 14 14 0.15600 -2.2400 0.01440 2.620 1.530 9.70
## 15 15 0.15600 -2.1900 0.01510 2.330 1.290 8.99
## 16 16 0.16000 -2.2000 0.02180 2.090 1.090 8.40
## 17 17 0.16300 -2.2600 0.02430 1.870 0.934 7.87
## 18 18 0.16700 -2.2700 0.03280 1.690 0.800 7.39
## 19 19 0.17000 -2.2300 0.04570 1.530 0.689 6.97
## 20 20 0.17300 -2.2100 0.05770 1.400 0.599 6.59
## 21 21 0.18000 -2.2800 0.06360 1.280 0.528 6.24
## 22 22 0.18000 -2.2200 0.07330 1.170 0.466 5.92
## 23 23 0.18500 -2.2900 0.08100 1.080 0.411 5.64
## 24 24 0.22700 -3.3600 0.14400 0.992 0.363 5.37
## 25 25 0.23000 -3.2900 0.14600 0.917 0.320 5.13
## 26 26 0.23200 -3.2200 0.15000 0.850 0.282 4.90
network = datExpr_redDim %>% t %>% blockwiseModules(power=best_power$powerEstimate, numericLabels=TRUE)
## mergeCloseModules: less than two proper modules.
## ..color levels are 0, 1
## ..there is nothing to merge.
# Store module labels keyed by sample ID (module 0 = unassigned)
clusterings[['WGCNA']] = setNames(network$colors, rownames(datExpr_redDim))
It finds a single cluster grouping 72 observations and leaves the rest without cluster:
table(clusterings[['WGCNA']])
##
## 0 1
## 14 72
Points don’t seem to follow a Gaussian distribution no matter the number of clusters, chose 3 clusters following the best k from K-means because the methods are similar
# BIC curve over candidate component counts (diagnostic only; best_k is set
# manually below to match K-means)
n_clust = Optimal_Clusters_GMM(datExpr_redDim, max_clusters=80, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')
best_k = 3 # copying k-means best_k
gmm = GMM(datExpr_redDim, best_k)
# Assign each sample to its highest-log-likelihood component
clusterings[['GMM']] = apply(gmm$Log_likelihood, 1, which.max)
Plot of clusters with their centroids in gray
# Append the GMM centroids below the sample points so both plot together
gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
# Centroid rows get NA labels, so ggplot draws them in the NA (grey) colour
gmm_labels = c(clusterings[['GMM']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())
# Clean up intermediates. NOTE(review): several names (cc_output, ICA_output,
# signals_w_kurtosis, ...) come from chunks not shown here, and `c` in this
# list looks accidental (it would mask base::c if it existed) — verify.
rm(wss, datExpr_k_means, h_clusts, cc_output, cc_output_c1, cc_output_c2, best_k, ICA_output,
ICA_clusters_names, signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network, dend_meta,
best_power, c, viridis_age_cols, create_viridis_dict)
Using Adjusted Rand Index:
All clusterings are pretty similar except for WGCNA and ICA
GMM and Hierarchical clustering are the most similar
ICA no longer seems to be clustering individuals together
Consensus clustering is the one with the strongest relation to ASD
# Extend the clusterings with phenotype labels so all can be compared with ARI
clusters_plus_phenotype = clusterings
# Binary "clustering": whether ICA left the sample unassigned
# NOTE(review): clusterings[['ICA_min']] is set in a chunk not shown here
clusters_plus_phenotype[['ICA_NA']] = is.na(clusters_plus_phenotype[['ICA_min']])
clusters_plus_phenotype[['Subject']] = datMeta_redDim$Subject_ID
clusters_plus_phenotype[['ASD']] = datMeta_redDim$Diagnosis_
clusters_plus_phenotype[['Region']] = datMeta_redDim$Brain_lobe
clusters_plus_phenotype[['Sex']] = datMeta_redDim$Sex
clusters_plus_phenotype[['Age']] = datMeta_redDim$Age
# Pairwise Adjusted Rand Index between every clustering / phenotype labelling.
# Only the upper triangle (diagonal included) is filled; the lower triangle
# stays NA and renders blank in the heatmap.
cluster_sim = data.frame(matrix(nrow = length(clusters_plus_phenotype), ncol = length(clusters_plus_phenotype)))
for(i in seq_along(clusters_plus_phenotype)){
  cluster1 = as.factor(clusters_plus_phenotype[[i]])
  for(j in i:length(clusters_plus_phenotype)){
    cluster2 = as.factor(clusters_plus_phenotype[[j]])
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusters_plus_phenotype)
rownames(cluster_sim) = colnames(cluster_sim)
cluster_sim = cluster_sim %>% as.matrix %>% round(2)

# Static heatmap with the ARI value printed in each cell (gplots::heatmap.2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none',
cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE,
cexRow = 1, cexCol = 1, margins = c(7,7))

rm(i, j, cluster1, cluster2, clusters_plus_phenotype, cluster_sim)
# Assemble the top-3 PCs plus every clustering and phenotype variable into one
# data frame for the interactive scatter plot.
# NOTE(review): ICA_clusters / 'cc_l1' / 'cc_l2' / 'ICA_min' and
# selectable_scatter_plot come from chunks not shown here — verify they exist
# at this point (ICA_clusters is not in the rm() call above).
plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
mutate(ID = rownames(.), subject_ID = datMeta_redDim$Subject_ID,
km_clust = as.factor(clusterings[['km']]), hc_clust = as.factor(clusterings[['hc']]),
cc_l1_clust = as.factor(clusterings[['cc_l1']]), cc_clust = as.factor(clusterings[['cc_l2']]),
ica_clust = as.factor(clusterings[['ICA_min']]), n_ica_clust = as.factor(rowSums(ICA_clusters)),
gmm_clust = as.factor(clusterings[['GMM']]), wgcna_clust = as.factor(clusterings[['WGCNA']]),
sex = as.factor(datMeta_redDim$Sex), region = as.factor(datMeta_redDim$Brain_lobe),
diagnosis = as.factor(datMeta_redDim$Diagnosis_), age = datMeta_redDim$Age)
selectable_scatter_plot(plot_points, plot_points)